import pandas as pd
import plotly.offline as pyo
import plotly.graph_objs as go
# Set notebook mode to work in offline
pyo.init_notebook_mode()
df1 = pd.read_csv("Doctorate_students_2022-04-26-ta_resdig_sctech_rdperes_perf__tsc00028.csv")
df1 = df1.dropna()
df1_spain = df1[df1.geo == "Spain"]
df1_spain
| time | _geo | geo | value | unit | _unit | isced97 | _isced97 | field | _field | sex | _sex | _flag | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 92 | 2005 | ES | Spain | 0.2 | Percentage of total population aged 20-29 | PC_Y20-29 | Second stage of tertiary education leading to ... | ED6 | Science, mathematics and computing, engineerin... | EF4_5 | Women | W | b |
| 394 | 2005 | ES | Spain | 0.3 | Percentage of total population aged 20-29 | PC_Y20-29 | Second stage of tertiary education leading to ... | ED6 | Science, mathematics and computing, engineerin... | EF4_5 | Men | M | b |
| 699 | 2005 | ES | Spain | 0.3 | Percentage of total population aged 20-29 | PC_Y20-29 | Second stage of tertiary education leading to ... | ED6 | Science, mathematics and computing, engineerin... | EF4_5 | Total | T | b |
import plotly.express as px
fig = px.scatter(df1_spain, x="value", y="unit", animation_frame="time", animation_group="geo",
size="value", color="sex", hover_name="geo", facet_col="geo",
size_max=45)
fig.show()
df_prop_gender_sci = pd.read_csv("prop_gender_science.csv")
df_prop_gender_sci
| time | _geo | geo | value | sex | _sex | unit | _unit | |
|---|---|---|---|---|---|---|---|---|
| 0 | 2012 | AT | Austria | 7.5 | Men | M | Percentage | PC |
| 1 | 2012 | AT | Austria | 12.2 | Women | W | Percentage | PC |
| 2 | 2012 | BE | Belgium | 9.5 | Men | M | Percentage | PC |
| 3 | 2012 | BE | Belgium | 14.7 | Women | W | Percentage | PC |
| 4 | 2012 | BG | Bulgaria | 2.4 | Men | M | Percentage | PC |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 127 | 2016 | SI | Slovenia | 7.1 | Women | W | Percentage | PC |
| 128 | 2016 | SK | Slovakia | 9.3 | Men | M | Percentage | PC |
| 129 | 2016 | SK | Slovakia | 12.0 | Women | W | Percentage | PC |
| 130 | 2016 | UK | United Kingdom | 1.8 | Men | M | Percentage | PC |
| 131 | 2016 | UK | United Kingdom | 5.1 | Women | W | Percentage | PC |
132 rows × 8 columns
import plotly.express as px
fig = px.scatter(df_prop_gender_sci, x="geo", y="value", facet_col="time",
size="value", color="sex", color_discrete_sequence=["#EF7949", "#83539D"])
fig.show()
df_total_personnel_in_rd = pd.read_csv("total_personnel_science_in_rd_by_gender.csv")
df_total_personnel_in_rd = df_total_personnel_in_rd[df_total_personnel_in_rd.unit == "Full-time equivalent (FTE)"]
df_total_personnel_in_rd = df_total_personnel_in_rd.dropna(subset=["value"])
df_total_personnel_in_rd = df_total_personnel_in_rd.sort_values(by='time')
df_total_personnel_in_rd = df_total_personnel_in_rd[df_total_personnel_in_rd.sectperf == "All sectors"]
df_total_personnel_in_rd = df_total_personnel_in_rd[df_total_personnel_in_rd.prof_pos == "Total"]
df_total_personnel_in_rd
| time | _geo | geo | value | sectperf | _sectperf | prof_pos | _prof_pos | sex | _sex | unit | _unit | _flag | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 53130 | 1980 | HU | Hungary | 25589.0 | All sectors | TOTAL | Total | TOTAL | Total | T | Full-time equivalent (FTE) | FTE | NaN |
| 52976 | 1980 | ES | Spain | 35496.0 | All sectors | TOTAL | Total | TOTAL | Total | T | Full-time equivalent (FTE) | FTE | NaN |
| 52708 | 1981 | BE | Belgium | 32531.0 | All sectors | TOTAL | Total | TOTAL | Total | T | Full-time equivalent (FTE) | FTE | e |
| 53131 | 1981 | HU | Hungary | 51512.0 | All sectors | TOTAL | Total | TOTAL | Total | T | Full-time equivalent (FTE) | FTE | d |
| 53211 | 1981 | IS | Iceland | 744.0 | All sectors | TOTAL | Total | TOTAL | Total | T | Full-time equivalent (FTE) | FTE | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 53376 | 2020 | MK | North Macedonia | 2029.0 | All sectors | TOTAL | Total | TOTAL | Total | T | Full-time equivalent (FTE) | FTE | NaN |
| 53467 | 2020 | NO | Norway | 48947.0 | All sectors | TOTAL | Total | TOTAL | Total | T | Full-time equivalent (FTE) | FTE | NaN |
| 53494 | 2020 | PL | Poland | 173392.0 | All sectors | TOTAL | Total | TOTAL | Total | T | Full-time equivalent (FTE) | FTE | NaN |
| 53435 | 2020 | NL | Netherlands | 161564.0 | All sectors | TOTAL | Total | TOTAL | Total | T | Full-time equivalent (FTE) | FTE | p |
| 53395 | 2020 | MT | Malta | 1807.0 | All sectors | TOTAL | Total | TOTAL | Total | T | Full-time equivalent (FTE) | FTE | p |
1562 rows × 13 columns
df_total_personnel_in_rd.geo.unique(), df_total_personnel_in_rd.sex.unique()
(array(['Hungary', 'Spain', 'Belgium', 'Iceland', 'Sweden', 'Finland',
'United Kingdom', 'Italy', 'Netherlands', 'Germany', 'Norway',
'Denmark', 'Austria', 'France', 'Ireland', 'Portugal',
'Switzerland', 'Greece', 'Turkey', 'Cyprus', 'Slovenia',
'Bulgaria', 'Romania', 'Latvia', 'Russia', 'Slovakia', 'Poland',
'Czechia', 'Lithuania', 'Estonia', 'Malta',
'Euro area (19 countries)', 'Luxembourg',
'European Union - 27 countries (from 2020)', 'Croatia',
'North Macedonia', 'Serbia', 'Montenegro',
'Bosnia and Herzegovina'], dtype=object),
array(['Total', 'Women'], dtype=object))
def filter_by_if_year_has_total_and_women_get_percentage(df_total_personnel_in_rd, country_name):
df_total_personnel_in_rd_filt = df_total_personnel_in_rd[df_total_personnel_in_rd.geo == country_name].sort_values("time")
years_with_data = df_total_personnel_in_rd_filt.time.value_counts() > 1
years_with_data = years_with_data.index[years_with_data]
#print(df_total_personnel_in_rd_filt.tail(20).values)
df_total_personnel_in_rd_filt = df_total_personnel_in_rd_filt[df_total_personnel_in_rd_filt.time.isin(years_with_data)]
total_year = {}
for row in df_total_personnel_in_rd_filt.values:
if row[8] == "Total":
total_year[row[0]] = row[3]
print(country_name)
if df_total_personnel_in_rd_filt.empty:
print(df_total_personnel_in_rd_filt, years_with_data)
return
df_total_personnel_in_rd_filt["percent"] = df_total_personnel_in_rd_filt.apply(lambda x: x["value"]/total_year[x["time"]], axis=1)
return df_total_personnel_in_rd_filt
df_cleaned = pd.DataFrame()
for country in df_total_personnel_in_rd.geo.unique():
df_country_cleaned = filter_by_if_year_has_total_and_women_get_percentage(df_total_personnel_in_rd, country)
df_cleaned = pd.concat([df_cleaned, df_country_cleaned])
df_cleaned
Hungary Spain Belgium Iceland Sweden Finland Empty DataFrame Columns: [time, _geo, geo, value, sectperf, _sectperf, prof_pos, _prof_pos, sex, _sex, unit, _unit, _flag] Index: [] Int64Index([], dtype='int64') United Kingdom Empty DataFrame Columns: [time, _geo, geo, value, sectperf, _sectperf, prof_pos, _prof_pos, sex, _sex, unit, _unit, _flag] Index: [] Int64Index([], dtype='int64') Italy Netherlands Germany Norway Empty DataFrame Columns: [time, _geo, geo, value, sectperf, _sectperf, prof_pos, _prof_pos, sex, _sex, unit, _unit, _flag] Index: [] Int64Index([], dtype='int64') Denmark Austria France Ireland Portugal Switzerland Empty DataFrame Columns: [time, _geo, geo, value, sectperf, _sectperf, prof_pos, _prof_pos, sex, _sex, unit, _unit, _flag] Index: [] Int64Index([], dtype='int64') Greece Turkey Cyprus Slovenia Bulgaria Romania Latvia Russia Empty DataFrame Columns: [time, _geo, geo, value, sectperf, _sectperf, prof_pos, _prof_pos, sex, _sex, unit, _unit, _flag] Index: [] Int64Index([], dtype='int64') Slovakia Poland Czechia Lithuania Estonia Malta Euro area (19 countries) Empty DataFrame Columns: [time, _geo, geo, value, sectperf, _sectperf, prof_pos, _prof_pos, sex, _sex, unit, _unit, _flag] Index: [] Int64Index([], dtype='int64') Luxembourg European Union - 27 countries (from 2020) Empty DataFrame Columns: [time, _geo, geo, value, sectperf, _sectperf, prof_pos, _prof_pos, sex, _sex, unit, _unit, _flag] Index: [] Int64Index([], dtype='int64') Croatia North Macedonia Serbia Montenegro Bosnia and Herzegovina
| time | _geo | geo | value | sectperf | _sectperf | prof_pos | _prof_pos | sex | _sex | unit | _unit | _flag | percent | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 53156 | 2006 | HU | Hungary | 25971.0 | All sectors | TOTAL | Total | TOTAL | Total | T | Full-time equivalent (FTE) | FTE | NaN | 1.000000 |
| 51763 | 2006 | HU | Hungary | 10797.0 | All sectors | TOTAL | Total | TOTAL | Women | W | Full-time equivalent (FTE) | FTE | NaN | 0.415733 |
| 53157 | 2007 | HU | Hungary | 25954.0 | All sectors | TOTAL | Total | TOTAL | Total | T | Full-time equivalent (FTE) | FTE | NaN | 1.000000 |
| 51764 | 2007 | HU | Hungary | 10504.0 | All sectors | TOTAL | Total | TOTAL | Women | W | Full-time equivalent (FTE) | FTE | NaN | 0.404716 |
| 51765 | 2008 | HU | Hungary | 10985.0 | All sectors | TOTAL | Total | TOTAL | Women | W | Full-time equivalent (FTE) | FTE | NaN | 0.400869 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 52705 | 2013 | BA | Bosnia and Herzegovina | 1399.0 | All sectors | TOTAL | Total | TOTAL | Total | T | Full-time equivalent (FTE) | FTE | NaN | 1.000000 |
| 52706 | 2014 | BA | Bosnia and Herzegovina | 1767.0 | All sectors | TOTAL | Total | TOTAL | Total | T | Full-time equivalent (FTE) | FTE | NaN | 1.000000 |
| 51577 | 2014 | BA | Bosnia and Herzegovina | 857.0 | All sectors | TOTAL | Total | TOTAL | Women | W | Full-time equivalent (FTE) | FTE | NaN | 0.485003 |
| 52707 | 2019 | BA | Bosnia and Herzegovina | 2037.0 | All sectors | TOTAL | Total | TOTAL | Total | T | Full-time equivalent (FTE) | FTE | NaN | 1.000000 |
| 51578 | 2019 | BA | Bosnia and Herzegovina | 1081.0 | All sectors | TOTAL | Total | TOTAL | Women | W | Full-time equivalent (FTE) | FTE | NaN | 0.530682 |
980 rows × 14 columns
country_dict = {"Bosnia and Herzegovina":"Bosnia y Herzegovina",
"Spain":"España",
"Belgium":"Bélgica",
"Cyprus":"Chipre",
"Czechia":"República Checa",
"France":"Francia",
"Germany":"Alemania",
"Denmark":"Dinamarca",
"Greece":"Grecia",
"Croatia":"Croacia",
"Hungary":"Hungaria",
"Ireland":"Irlanda",
"Iceland":"Islandia",
"Italy":"Italia",
"Lithuania":"Lituania",
"Luxembourg":"Luxemburgo",
"Latvia":"Letonia",
"North Macedonia":"Macedonia del Norte",
"Netherlands":"Paises Bajos",
"Poland":"Polonia",
"Romania":"Rumanía",
"Sweden":"Suecia",
"Norway":"Noruega",
"Slovenia":"Eslovenia",
"Slovakia":"Eslovaquia",
"Turkey":"Turquía",
"Switzerland":"Suiza"}
def convert_country_name(country_name):
translated_country = None
if country_name in country_dict.keys():
translated_country = country_dict[country_name]
else:
translated_country = country_name
return translated_country
df_cleaned.geo = df_cleaned.geo.apply(convert_country_name)
avg_women_percent_by_year = {}
for year in df_cleaned.time.unique():
avg_women_percent_by_year[year] = df_cleaned[(df_cleaned.sex=="Women") &
(df_cleaned.time==year)]["value"].sum() / \
df_cleaned[(df_cleaned.sex=="Total") &
(df_cleaned.time==year)]["value"].sum()
avg_women_percent_by_year
{2006: 0.364376751262983,
2007: 0.3238009195484552,
2008: 0.37567710405520155,
2009: 0.3318481072479871,
2010: 0.33348615226354544,
2011: 0.32244098509597036,
2012: 0.3422411713648949,
2013: 0.324851388660272,
2014: 0.3511998112391364,
2015: 0.3302487361180176,
2016: 0.3546968158407643,
2017: 0.3269165733273229,
2018: 0.35565858451639415,
2019: 0.3276984680329024,
1997: 0.26904255770150864,
1999: 0.28791480277447107,
2001: 0.3066278856677181,
2002: 0.36760359979190455,
2003: 0.3103814777719663,
2004: 0.36246515363898385,
2005: 0.3241359354453265,
2000: 0.4850806901276067,
1998: 0.4192304889601399,
2020: 0.3504186422865714,
1994: 0.4722347718478069,
1995: 0.4811903325407247,
1996: 0.4762487366349274,
1993: 0.5367495072826035}
fig = px.scatter(df_cleaned[(df_cleaned.sex=="Women") & (df_cleaned.time>2000)], x="geo", y="percent",
color="time", hover_name="sex", height=700,
labels={
"geo": "Paises",
"time": "Año",
"percent": "Porcentaje"
})
fig.write_html("women_europe_rd.html")
fig.show()
fig = px.scatter(df_cleaned[(df_cleaned.sex=="Women")
& (df_cleaned.time>2000) &
(df_cleaned.geo=="Spain")], x="time", y="percent", height=400, color="time",
labels={
"geo": "Paises",
"time": "Año",
"percent": "Porcentaje"
},
color_discrete_sequence=["#83539D", "#EF7949"])
fig.write_html("women_spain_rd.html")
fig.show()
df_investigators = pd.read_csv("carrera_investigadora.csv")
df_investigators.iloc[3, 0] = 'Tesis aprobadas'
df_investigators.iloc[1, 0] = 'Estudios de máster'
df_investigators
| Nivel de la carrera investigadora | Mujeres | Hombres | |
|---|---|---|---|
| 0 | Estudios de grado y primer y segundo ciclo | 55.2 | 44.8 |
| 1 | Estudios de máster | 54.4 | 45.5 |
| 2 | Esudios de doctorado | 50.0 | 50.0 |
| 3 | Tesis aprobadas | 49.8 | 50.2 |
| 4 | Grado D | 47.6 | 52.4 |
| 5 | Grado C | 50.0 | 50.0 |
| 6 | Grado B | 44.5 | 55.5 |
| 7 | Grado A | 24.1 | 75.9 |
df_investigators = df_investigators.melt(id_vars=["Nivel de la carrera investigadora"],
value_vars=["Mujeres", "Hombres"],
var_name="Sexo",
value_name="Porcentaje")
df_investigators
| Nivel de la carrera investigadora | Sexo | Porcentaje | |
|---|---|---|---|
| 0 | Estudios de grado y primer y segundo ciclo | Mujeres | 55.2 |
| 1 | Estudios de máster | Mujeres | 54.4 |
| 2 | Esudios de doctorado | Mujeres | 50.0 |
| 3 | Tesis aprobadas | Mujeres | 49.8 |
| 4 | Grado D | Mujeres | 47.6 |
| 5 | Grado C | Mujeres | 50.0 |
| 6 | Grado B | Mujeres | 44.5 |
| 7 | Grado A | Mujeres | 24.1 |
| 8 | Estudios de grado y primer y segundo ciclo | Hombres | 44.8 |
| 9 | Estudios de máster | Hombres | 45.5 |
| 10 | Esudios de doctorado | Hombres | 50.0 |
| 11 | Tesis aprobadas | Hombres | 50.2 |
| 12 | Grado D | Hombres | 52.4 |
| 13 | Grado C | Hombres | 50.0 |
| 14 | Grado B | Hombres | 55.5 |
| 15 | Grado A | Hombres | 75.9 |
import plotly.express as px
fig = px.line(df_investigators, x="Nivel de la carrera investigadora", y="Porcentaje",
color='Sexo',
height=500, color_discrete_sequence=["#83539D", "#EF7949"])
fig.write_html("genero_investigadoras.html")
fig.show()
df_computer_use = pd.read_csv("2022-05-02-ta_resdig_dig_comp__isoc_ci_cfp_cu_computer use male.csv")
#df_computer_use_female = pd.read_csv("2022-05-02-ta_resdig_dig_comp__isoc_ci_cfp_cu_comuter use women.csv")
df_computer_use
| time | _geo | geo | value | unit | _unit | indic_is | _indic_is | ind_type | _ind_type | _flag | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2010 | AT | Austria | 82.0 | Percentage of individuals | PC_IND | Last computer use: within last 3 months | I_C3 | Individuals who are born in another EU Member ... | CB_EU_FOR | NaN |
| 1 | 2011 | AT | Austria | 75.0 | Percentage of individuals | PC_IND | Last computer use: within last 3 months | I_C3 | Individuals who are born in another EU Member ... | CB_EU_FOR | NaN |
| 2 | 2012 | AT | Austria | 89.0 | Percentage of individuals | PC_IND | Last computer use: within last 3 months | I_C3 | Individuals who are born in another EU Member ... | CB_EU_FOR | NaN |
| 3 | 2013 | AT | Austria | 83.0 | Percentage of individuals | PC_IND | Last computer use: within last 3 months | I_C3 | Individuals who are born in another EU Member ... | CB_EU_FOR | NaN |
| 4 | 2014 | AT | Austria | 81.0 | Percentage of individuals | PC_IND | Last computer use: within last 3 months | I_C3 | Individuals who are born in another EU Member ... | CB_EU_FOR | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 184577 | 2008 | UK | United Kingdom | 70.0 | Percentage of individuals | PC_IND | Computer use: never | I_CUX | Individuals, 75 years old or more | Y75_MAX | NaN |
| 184578 | 2009 | UK | United Kingdom | 72.0 | Percentage of individuals | PC_IND | Computer use: never | I_CUX | Individuals, 75 years old or more | Y75_MAX | NaN |
| 184579 | 2010 | UK | United Kingdom | 66.0 | Percentage of individuals | PC_IND | Computer use: never | I_CUX | Individuals, 75 years old or more | Y75_MAX | NaN |
| 184580 | 2011 | UK | United Kingdom | 60.0 | Percentage of individuals | PC_IND | Computer use: never | I_CUX | Individuals, 75 years old or more | Y75_MAX | NaN |
| 184581 | 2012 | UK | United Kingdom | 58.0 | Percentage of individuals | PC_IND | Computer use: never | I_CUX | Individuals, 75 years old or more | Y75_MAX | NaN |
184582 rows × 11 columns
df_computer_use.indic_is.unique(), df_computer_use.ind_type.unique()
(array(['Last computer use: within last 3 months', 'Computer use: never',
'Last computer use: between 3 and 12 months ago',
'Individuals who have ever used a computer',
'Last computer use: within last 12 months',
'Individuals who used a computer more than a year ago'],
dtype=object),
array(['Individuals who are born in another EU Member State',
'Non-nationals', 'Individuals who are born in non-EU country',
'Individuals who are foreign-born',
'Individuals who are native-born',
'Nationals of another EU-Member State',
'Nationals of non-EU country', 'Nationals',
'Active labour force (employed and unemployed)',
'Females with low formal education',
'Females with medium formal education',
'Females with high formal education',
'Females, 16 to 19 years old', 'Females, 16 to 24 years old',
'Females, 16 to 29 years old', 'Females, 16 to 74 years old',
'Females, 20 to 24 years old', 'Females, 25 to 29 years old',
'Females 25 to 54 years old', 'Females, 25 to 64 years old',
'Females 55 to 74 years old',
'Individual living in a household with income in first quartile',
'Individual living in a household with income in second quartile',
'Individual living in a household with income in third quartile',
'Individual living in a household with income in fourth quartile',
'Individuals with no or low formal education',
'Individuals with medium formal education',
'Individuals with high formal education',
'Individuals living in a household with broadband access',
'Individuals living in a household with Internet access but with no broadband access',
'Individuals living in a household with children',
'Individuals living in densely-populated area (at least 500 inhabitants/Km²)',
'Individuals living in intermediate urbanized area (between 100 and 499 inhabitants/Km²)',
'Individuals living in sparsely populated area (less than 100 inhabitants/Km²)',
"Individuals living in Not Objective 1 regions / 'Regional Competitiveness and Employment' Region",
'Individuals living in a household without children',
"Individuals living in Objective 1 regions / 'Convergence' regions",
'All Individuals', 'Non-manual including the armed forces',
'Manual', 'ICT professionals', 'Non ICT professionals',
'Mobile internet users', 'Non-users of mobile internet',
'Males with low formal education',
'Males with medium formal education',
'Males with high formal education', 'Males, 16 to 19 years old',
'Males, 16 to 24 years old', 'Males, 16 to 29 years old',
'Males, 16 to 74 years old', 'Males, 20 to 24 years old',
'Males, 25 to 29 years old', 'Males 25 to 54 years old',
'Males, 25 to 64 years old', 'Males 55 to 74 years old',
'Retired Individuals', 'Retired and other inactive',
'Individuals with at least one of the 3 following characteristics: 55 to 74 years old; low education; unemployed or inactive or retired',
'Individuals with at least two of the 3 following characteristics: 55 to 74 years old; low education; unemployed or inactive or retired',
'Employees', 'Employees, self-employed, family workers',
'Self-employed, family workers', 'Students', 'Unemployed',
'Individuals, 15 years old or less',
'Individuals, 16 to 19 years old',
'Individuals, 16 to 24 years old',
'Individuals aged 16-24 with high formal education',
'Individuals aged 16-24 with low education',
'Individuals aged 16-24 with medium formal education',
'Individuals, 16 to 29 years old',
'Individuals aged 16-29 with high formal education',
'Individuals aged 16-29 with low formal education',
'Individuals aged 16-29 with medium formal education',
'Individuals, 20 to 24 years old',
'Individuals, 25 to 29 years old',
'Individuals, 25 to 34 years old',
'Individuals, 25 to 54 years old',
'Individuals aged 25 to 54 with high formal education',
'Individuals aged 25 to 54 with low formal education',
'Individuals aged 25 to 54 with medium formal education',
'Individuals, 25 to 64 years old',
'Individuals aged 25 to 64 with high formal education',
'Individuals aged 25 to 64 with low formal education',
'Individuals aged 25 to 64 with medium formal education',
'Individuals aged 25 to 64 who are in the active labour force (employed and unemployed)',
'Individuals aged 25 to 64 who are retired or other inactive',
'Individuals aged 25 to 64 who are employees, self-employed or family workers',
'Individuals aged 25 to 64 who are unemployed',
'Individuals, 35 to 44 years old',
'Individuals, 45 to 54 years old',
'Individuals, 55 to 64 years old',
'Individuals, 55 to 74 years old',
'Individuals aged 55 to 74 with high formal education',
'Individuals aged 55 to 74 with low formal education',
'Individuals aged 55 to 74 with medium formal education',
'Individuals, 65 to 74 years old',
'Individuals, 75 years old or more'], dtype=object))
df_computer_use_male = df_computer_use[(df_computer_use.ind_type.str.contains("Males")) &
(df_computer_use.ind_type.str.contains("years old")) &
(df_computer_use.indic_is=="Last computer use: within last 3 months")]
df_computer_use_male["Sexo"] = "Hombre"
df_computer_use_female = df_computer_use[(df_computer_use.ind_type.str.contains("Females")) &
(df_computer_use.ind_type.str.contains("years old")) &
(df_computer_use.indic_is=="Last computer use: within last 3 months")]
df_computer_use_female["Sexo"] = "Mujer"
computer_use_cleaned = pd.concat([df_computer_use_male, df_computer_use_female])
computer_use_cleaned["ind_type_new"] = computer_use_cleaned._ind_type.str.slice(start=-5, stop=8).str.replace("_", "-")
computer_use_cleaned
/tmp/ipykernel_727/3118441712.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy /tmp/ipykernel_727/3118441712.py:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
| time | _geo | geo | value | unit | _unit | indic_is | _indic_is | ind_type | _ind_type | _flag | Sexo | ind_type_new | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 15342 | 2011 | AT | Austria | 99.0 | Percentage of individuals | PC_IND | Last computer use: within last 3 months | I_C3 | Males, 16 to 19 years old | M_Y16_19 | NaN | Hombre | 16-19 |
| 15343 | 2012 | AT | Austria | 100.0 | Percentage of individuals | PC_IND | Last computer use: within last 3 months | I_C3 | Males, 16 to 19 years old | M_Y16_19 | NaN | Hombre | 16-19 |
| 15344 | 2013 | AT | Austria | 100.0 | Percentage of individuals | PC_IND | Last computer use: within last 3 months | I_C3 | Males, 16 to 19 years old | M_Y16_19 | NaN | Hombre | 16-19 |
| 15345 | 2014 | AT | Austria | 100.0 | Percentage of individuals | PC_IND | Last computer use: within last 3 months | I_C3 | Males, 16 to 19 years old | M_Y16_19 | NaN | Hombre | 16-19 |
| 15346 | 2015 | AT | Austria | 99.0 | Percentage of individuals | PC_IND | Last computer use: within last 3 months | I_C3 | Males, 16 to 19 years old | M_Y16_19 | NaN | Hombre | 16-19 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6489 | 2013 | UK | United Kingdom | 72.0 | Percentage of individuals | PC_IND | Last computer use: within last 3 months | I_C3 | Females 55 to 74 years old | F_Y55_74 | NaN | Mujer | 55-74 |
| 6490 | 2014 | UK | United Kingdom | 77.0 | Percentage of individuals | PC_IND | Last computer use: within last 3 months | I_C3 | Females 55 to 74 years old | F_Y55_74 | NaN | Mujer | 55-74 |
| 6491 | 2015 | UK | United Kingdom | 78.0 | Percentage of individuals | PC_IND | Last computer use: within last 3 months | I_C3 | Females 55 to 74 years old | F_Y55_74 | NaN | Mujer | 55-74 |
| 6492 | 2017 | UK | United Kingdom | 84.0 | Percentage of individuals | PC_IND | Last computer use: within last 3 months | I_C3 | Females 55 to 74 years old | F_Y55_74 | NaN | Mujer | 55-74 |
| 6493 | 2017 | XK | Kosovo | 13.0 | Percentage of individuals | PC_IND | Last computer use: within last 3 months | I_C3 | Females 55 to 74 years old | F_Y55_74 | NaN | Mujer | 55-74 |
6038 rows × 13 columns
spain_xy = computer_use_cleaned[(computer_use_cleaned.geo=="Spain") &
(computer_use_cleaned.time==2017)][["ind_type_new", "value", "Sexo"]].values
spain_xy[:, 0], spain_xy[:, 1], spain_xy[:, 2]
(array(['16-19', '16-24', '16-29', '16-74', '20-24', '25-29', '25-54',
'25-64', '55-74', '16-19', '16-24', '16-29', '16-74', '20-24',
'25-29', '25-54', '25-64', '55-74'], dtype=object),
array([95.0, 95.0, 93.0, 77.0, 95.0, 89.0, 83.0, 79.0, 55.0, 91.0, 92.0,
89.0, 71.0, 93.0, 84.0, 82.0, 76.0, 44.0], dtype=object),
array(['Hombre', 'Hombre', 'Hombre', 'Hombre', 'Hombre', 'Hombre',
'Hombre', 'Hombre', 'Hombre', 'Mujer', 'Mujer', 'Mujer', 'Mujer',
'Mujer', 'Mujer', 'Mujer', 'Mujer', 'Mujer'], dtype=object))
color_list = []
for item in spain_xy[:, 2]:
if item == "Hombre":
color_list.append("#EF7949")
else:
color_list.append("#83539D")
color_list
['#EF7949', '#EF7949', '#EF7949', '#EF7949', '#EF7949', '#EF7949', '#EF7949', '#EF7949', '#EF7949', '#83539D', '#83539D', '#83539D', '#83539D', '#83539D', '#83539D', '#83539D', '#83539D', '#83539D']
import plotly.express as px
import plotly.graph_objects as go
fig = px.violin(computer_use_cleaned[computer_use_cleaned.time==2017], x="ind_type_new", y="value",
color='Sexo', hover_data=["geo"], points='all',
#facet_col="time", facet_col_wrap=2,
height=600, width=1000,
color_discrete_sequence=["#EF7949", "#83539D"],
labels={
"value": "Porcentaje",
"ind_type_new": "Grupos de Edad",
})
fig.add_trace(go.Scatter(x=spain_xy[:, 0], y=spain_xy[:, 1], mode = 'markers',
marker_size = 20, name="Spain", opacity=0.8,
marker_color=color_list,))
fig.write_html("computer_use.html")
fig.show()
df_internet_use = pd.read_csv("2022-05-02-ta_resdig_dig_intuse__isoc_ci_ifp_iu_individuals_internet used_female.csv")
df_internet_use_male = df_internet_use[(df_internet_use.ind_type.str.contains("Males")) &
(df_internet_use.ind_type.str.contains("years old")) &
(df_internet_use.indic_is=='Last Internet use: in last 3 months')]
df_internet_use_male["Sexo"] = "Hombre"
df_internet_use_female = df_internet_use[(df_internet_use.ind_type.str.contains("Females")) &
(df_internet_use.ind_type.str.contains("years old")) &
(df_internet_use.indic_is=='Last Internet use: in last 3 months')]
df_internet_use_female["Sexo"] = "Mujer"
internet_use_cleaned = pd.concat([df_internet_use_male, df_internet_use_female])
internet_use_cleaned["ind_type_new"] = internet_use_cleaned._ind_type.str.slice(start=-5, stop=8).str.replace("_", "-")
internet_use_cleaned
/tmp/ipykernel_727/2861403796.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy /tmp/ipykernel_727/2861403796.py:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
| time | _geo | geo | _flag | indic_is | _indic_is | unit | _unit | ind_type | _ind_type | value | Sexo | ind_type_new | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 219659 | 2018 | AL | Albania | NaN | Last Internet use: in last 3 months | I_IU3 | Percentage of individuals | PC_IND | Males, 16 to 19 years old | M_Y16_19 | 90.0 | Hombre | 16-19 |
| 219660 | 2019 | AL | Albania | NaN | Last Internet use: in last 3 months | I_IU3 | Percentage of individuals | PC_IND | Males, 16 to 19 years old | M_Y16_19 | 96.0 | Hombre | 16-19 |
| 219661 | 2020 | AL | Albania | NaN | Last Internet use: in last 3 months | I_IU3 | Percentage of individuals | PC_IND | Males, 16 to 19 years old | M_Y16_19 | 95.0 | Hombre | 16-19 |
| 219662 | 2021 | AL | Albania | NaN | Last Internet use: in last 3 months | I_IU3 | Percentage of individuals | PC_IND | Males, 16 to 19 years old | M_Y16_19 | 97.0 | Hombre | 16-19 |
| 219663 | 2011 | AT | Austria | NaN | Last Internet use: in last 3 months | I_IU3 | Percentage of individuals | PC_IND | Males, 16 to 19 years old | M_Y16_19 | 98.0 | Hombre | 16-19 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 206804 | 2020 | UK | United Kingdom | NaN | Last Internet use: in last 3 months | I_IU3 | Percentage of individuals | PC_IND | Females 55 to 74 years old | F_Y55_74 | 92.0 | Mujer | 55-74 |
| 206805 | 2017 | XK | Kosovo | NaN | Last Internet use: in last 3 months | I_IU3 | Percentage of individuals | PC_IND | Females 55 to 74 years old | F_Y55_74 | 61.0 | Mujer | 55-74 |
| 206806 | 2018 | XK | Kosovo | NaN | Last Internet use: in last 3 months | I_IU3 | Percentage of individuals | PC_IND | Females 55 to 74 years old | F_Y55_74 | 72.0 | Mujer | 55-74 |
| 206807 | 2019 | XK | Kosovo | NaN | Last Internet use: in last 3 months | I_IU3 | Percentage of individuals | PC_IND | Females 55 to 74 years old | F_Y55_74 | 80.0 | Mujer | 55-74 |
| 206808 | 2020 | XK | Kosovo | NaN | Last Internet use: in last 3 months | I_IU3 | Percentage of individuals | PC_IND | Females 55 to 74 years old | F_Y55_74 | 89.0 | Mujer | 55-74 |
9481 rows × 13 columns
spain_xy = internet_use_cleaned[(internet_use_cleaned.geo=="Spain") &
(internet_use_cleaned.time==2017)][["ind_type_new", "value", "Sexo"]].values
color_list = []
for item in spain_xy[:, 2]:
if item == "Hombre":
color_list.append("#EF7949")
else:
color_list.append("#83539D")
color_list, spain_xy[:, 0], spain_xy[:, 1], spain_xy[:, 2]
(['#EF7949',
'#EF7949',
'#EF7949',
'#EF7949',
'#EF7949',
'#EF7949',
'#EF7949',
'#EF7949',
'#EF7949',
'#83539D',
'#83539D',
'#83539D',
'#83539D',
'#83539D',
'#83539D',
'#83539D',
'#83539D',
'#83539D'],
array(['16-19', '16-24', '16-29', '16-74', '20-24', '25-29', '25-54',
'25-64', '55-74', '16-19', '16-24', '16-29', '16-74', '20-24',
'25-29', '25-54', '25-64', '55-74'], dtype=object),
array([98.0, 98.0, 98.0, 85.0, 98.0, 98.0, 94.0, 90.0, 63.0, 96.0, 98.0,
97.0, 84.0, 100.0, 95.0, 94.0, 89.0, 59.0], dtype=object),
array(['Hombre', 'Hombre', 'Hombre', 'Hombre', 'Hombre', 'Hombre',
'Hombre', 'Hombre', 'Hombre', 'Mujer', 'Mujer', 'Mujer', 'Mujer',
'Mujer', 'Mujer', 'Mujer', 'Mujer', 'Mujer'], dtype=object))
fig = px.violin(internet_use_cleaned[internet_use_cleaned.time==2017], x="ind_type_new", y="value",
color='Sexo', hover_data=["geo"], points='all',
#facet_col="time", facet_col_wrap=2,
height=600, width=1000,
color_discrete_sequence=["#EF7949", "#83539D"], labels={
"value": "Porcentaje",
"ind_type_new": "Grupos de Edad",
})
fig.add_trace(go.Scatter(x=spain_xy[:, 0], y=spain_xy[:, 1], mode = 'markers',
marker_size = 20, name="Spain", opacity=0.8,
marker_color=color_list,))
fig.write_html("internet_use.html")
fig.show()